######################################
## Replication masterfile for
## "An Informed Forensics Approach to
## Detecting Vote Irregularities"
######################################
## The following reproduces all estimates
## and figures reported in our manuscript
## roughly in the order in which they appear 
## in the text.
## Due to the unseedable nature of the BART
## implementation we use, we routinely invoke 
## objects that contain the samples used to produce
## the exact quantities reported, in addition to
## providing the code that was used to produce 
## said objects. Replications based on that code
## can be expected to vary slightly with respect
## to the values reported in our manuscript due to
## sampling error. All seedable operations are seeded.
## Finally, it is important to note that some of the
## procedures used are computationally intensive, and 
## were completed using parallelized routines on a
## cluster with over 400 cores available. We
## use the MPI backend and also provide the code 
## used to submit the jobs to the LSF
## queing system used in the cluster on which computations
## took place. However, all scripts can be run using a
## different backend on a machine with fewer cores --
## with a corresponding increase in computation time, of 
## course.

## Load required libraries
library(plyr)
library(ltm)
library(psy)
library(foreach)
# library(doMPI) # To run scripts on cluster
library(doMC) # To run scripts on local machine.
library(glmnet)
library(BayesTree)
library(abind)
library(lattice)
library(latticeExtra)
library(RColorBrewer)

############################
## LOAD AUXILIARY FUNCTIONS
############################
## The sourced code invokes functions
## that are used by other scripts to perform
## data processing and generalization
## error estimation tasks.
source("./DataProcessing.R")
source("./GenErrorAuxFunc.R")


################
## LOAD DATASETS
################
## In addition to creating train and test subsets,
## we divide our dataset based on feature types,
## such that we have a forensics-only subset,
## a context-only subset, and a full set, of the 
## train and test varieties.
all.data  <- data.preproc()

#############################
## IRT FRAUD PROXY ESTIMATION
#############################
## The script produces object `fraud.pos'
## which contains IRT scores per country-election.
source("./FraudScoreIRT.R")

## Use alternative IRT measure?
## To check robustness, we estimate IRT of 
## three items (viz. nelda32, nelda48 and nelda49)
use.alt.irt  <- FALSE #Set to TRUE to use alternative IRT measure

if (use.alt.irt){
  fraud.pos$fraud.score  <-   fraud.pos$fraud.score.3item 
}
fraud.pos  <- subset(fraud.pos,select=-c(fraud.score.3item))

## Table B2:
summary(irt.model)


## Relative frequencies of `yes' answers referenced
## in footnote 34:

## Merge fraud proxy with rest of data
## and reset to matrix form, so BART can 
## process it.
all.data[1:3] <- llply(1:3,.fun=function(x){
  all.data[[x]]  <- as.data.frame(all.data[[x]])
  all.data[[x]]$country  <- all.data[[x+3]]
  all.data[[x]]
})
all.data$fData  <- merge(all.data$fData,fraud.pos,by=c("year","country"))
all.data$fData  <- as.matrix(all.data$fData[,-grep("country|year",colnames(all.data$fData))])
all.data$fDataForen  <- merge(all.data$fDataForen,fraud.pos,by=c("year","country"))
all.data$fDataForen  <- as.matrix(all.data$fDataForen[,-grep("country|year",colnames(all.data$fDataForen))])
all.data$fDataInform  <- merge(all.data$fDataInform,fraud.pos,by=c("year","country"))
all.data$fDataInform  <- as.matrix(all.data$fDataInform[,-grep("country|year",colnames(all.data$fDataInform))])
all.data$fDataForMerge <- merge(all.data$fDataForMerge,fraud.pos,by=c("year","country"))


## Frequency, Relative frequency, and country-years of IRT scores presented in
## Table 1:
table(all.data$fData[,"fraud.score"])
round(prop.table(table(all.data$fData[,"fraud.score"]))*100,2)
limits  <-rbind(c(2.1,1.9),c(1.8,1.7),c(1.7,1.66),c(1.65,1.63)
             ,c(1.63,1.61),c(1.57,1.55),c(1.54,1.52)
             ,c(1.20,1.18),c(1.11,1.09),c(1.06,1.05)
             ,c(1.05,1.03),c(.75,.73),c(.59,.56),c(.50,.48)
             ,c(.37,.34),c(0,-2))
for(i in 1:nrow(limits)){
  cat("Example scores from",limits[i,2],"to",limits[i,1],":\n")
  print(head(all.data$fDataForMerge[all.data$fDataForMerge$fraud.score<limits[i,1] & all.data$fDataForMerge$fraud.score>limits[i,2]
                          ,c("country", "year", "fraud.score")]))
}

## Save data so other subroutines can access it

save(all.data,file="./FullData.RData")

###########################
## BART PARAMETER SELECTION
###########################
## The script either submits a job to the LSF queueing
## system that manages cluster resources (what was actually
## done to obtain the parameters used in the manuscript) or
## conducts a parameter sweep using a bootstrap, depending
## on logical flag passed as argument to Rscript. Currently set 
## to TRUE. The output of said routine is a csv loaded in line 141.
## Use discretion when uncommenting.
#system("Rscript ./ParameterSelection/BootstrapParamSweep.R TRUE")
#Sys.sleep(14400) #All else depends on these results; hence, we must wait.

(best.params <- read.csv("./BSParamSweep.csv"
                          ,nrows=1)[,1:6])
best.params$burnin  <- 50e3
best.params$niter <- 5e3
best.params$keepevery <- 10
best.params$sampind  <- 1


#########################
## GENERALIZATION ERROR
#########################
## Table 2:
## The following script reestimates BART models that produce
## the results reported in Table 2 of the manuscript. However,
## in the interest of exact replication, the script simply
## loads results reported in the manuscript. Those interested
## in reestimating the models should uncomment lines 30-40
## in GenErrorTestSet.R. 
source("./GenErrorTestSet.R")

## Specificity & Sensitivity:
source("./SpecSens.R")

## 15-fold Cross-validation gen. error:
## There is one script for each model: full, foren, and
## contextual features only. Lines 170-172 estimate models
## using cluster (flag set to TRUE). With these commented out,
## however, line 174 loads and operates on the data used in the
## original manuscript.
#system("Rscript ./GenErrorKFoldFull.R TRUE")
#system("Rscript ./GenErrorKFoldForen.R TRUE")
#system("Rscript ./GenErrorKFoldContext.R TRUE")
#Analysis reports (in Appendix E):
source("/kFold/kFoldAnalysis.R")

## LOO Bootstrap gen. error:
## Again, one script per model. These models require
## substantially more computing resources to estimate
## so again we resort to the computer cluster. Lines
## 185-187 estimate models on a cluster using LSF
## queueing system (flag set to TRUE). With these commented out,
## however, line 189 loads and operates on the data used in the
## original manuscript.
## Use discretion when uncommenting the following lines.
#system("Rscript ./GenErrorBootFull.R TRUE")
#system("Rscript ./GenErrorBootForen.R TRUE")
#system("Rscript ./GenErrorBootContext.R TRUE")
#Produce Table B3 (in Appendix E):
source("BootGenError.R")

#########################################
## INCLUSION PROBABILITY PLOT (Figure 3)
#########################################
## The BART implementation we use cannot be seeded.
## As a result, the following script loads results 
## used for producing the plot presented in the manuscript.
## However, it contains the commands needed to reestimate the
## BART model (see lines 19-34 of InclProbPlots.R), for
## replication purposes.
source("./InclProbPlots.R")

#######################################
## EXTERNAL VALIDATION PLOT (Figure 7)
#######################################
source("./ExtValidationPlots.R")

##################################################
## SIMPLE PARTIAL DEPENDENCY PLOTS (Figures 4 & 5)
##################################################
## Generating partial dependency plots using the BART
## implementation we used requires reestimation of the
## model for every plotted variable. Hence, we split
## the generation of these graphs into model estimation
## and graph making.
## Model estimation is parallelized to run each model on
## a different core. Since there aren't that many variables,
## we use a desktop computer with 20 available cores and
## use the MC parallel backend. Lines 38-61 of PartialDepPlots.R
## allows reestimation of said models, though we comment them
## and simply load results used in our original manuscript for
## exact replicability purposes. Those interested in reestimation
## are welcome to uncomment those lines.
source("./PartialDepPlots.R")
## Graph generation:
source("./ResultsVisual.R")

###############################################
## TWO-WAY PARTIAL DEPENDENCY PLOT (Figure 6)
###############################################
## Once again, interaction partial dependencies require
## reestimation of the BART model. In addition, and given
## the amount of points on the grid on which the partial
## dependency function is evaluated, a large amount of
## memory is required for estimation. We resort to the 
## a computer with 192Gb of RAM for this task. The following
## script (l. 241) reestimates the BART needed to estimate the 
## partial dependency function using the 192Gb computer (flag set
## to TRUE); Set to FALSE to run on another computer.
## Uncomment with care. Line 242 loads results used in the original
## manuscript.
#system("Rscript ./InteractionPlots.R TRUE")
load("./BartPlotObjectsInter.RData")
## Graph generation:
source("./ResultsVisual3d.R")

##################################
## BART ILLUSTRATION IN APPENDIX F
##################################
source("./BartIllustration.R")


